from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import from_json
from pyspark.ml.feature import VectorAssembler
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Load one month of Rucio trace records from HDFS into a Spark DataFrame.
# (A bare template placeholder, `path_on_hdfs_to_your_data`, was left here by
# the original notebook; it raised NameError when run and has been removed.)
df = spark.read.json("/user/rucio01/traces/traces.2018-04*")
# use this for calculating percentages
df.count()
# None (not the deprecated -1) means "never truncate column contents".
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 200)
df.describe().toPandas()
# Derive one column per dot-separated component of the dataset name
# (order: project.run_number.stream_name.prod_step.datatype.dataset_version
# — presumably the ATLAS DDM naming convention; confirm against the data).
# r"\." — raw string so the dot is regex-escaped; the previous "\."
# relied on an invalid escape sequence (DeprecationWarning in Python 3).
split_col = split(df['dataset'], r"\.")
_DATASET_FIELDS = ('project', 'run_number', 'stream_name',
                   'prod_step', 'datatype', 'dataset_version')
for _idx, _field in enumerate(_DATASET_FIELDS):
    df = df.withColumn(_field, split_col.getItem(_idx))
# Register the enriched DataFrame so the SQL queries below can use it.
df.createOrReplaceTempView('traces')
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300

# Denominator for the percentage column: total number of trace records in the
# sample. NOTE(review): presumably the value printed by df.count() above —
# re-check this constant whenever the input glob changes. It was previously
# hard-coded 27 times throughout this file.
TOTAL_TRACES = 244917469


def _top10_by(column):
    """Return a 10-row pandas DataFrame with the most frequent values of
    *column* in the `traces` view, their counts, and their percentage of
    all TOTAL_TRACES records.

    *column* must be a trusted, hard-coded column name (it is interpolated
    into the SQL text; never pass user input here).
    """
    query = (
        "SELECT *, ROUND((count/{total})*100,2) as percentage FROM("
        "SELECT {col}, COUNT(*) as count "
        "FROM traces "
        "GROUP BY {col} "
        "ORDER BY count DESC)"
    ).format(total=TOTAL_TRACES, col=column)
    return spark.sql(query).limit(10).toPandas()


def _plot_top10(df_plot, column, label_offset, rotation=None, ha=None,
                fontsize=None):
    """Bar-plot a top-10 frequency table produced by _top10_by().

    label_offset -- vertical offset (in count units) of the percentage label
                    drawn above each bar.
    rotation/ha/fontsize -- optional x-tick-label styling, applied only when
                    rotation is given (mirrors the per-column tweaks the
                    original notebook cells made).
    Returns the matplotlib Axes for further customisation.
    """
    plt.figure()  # one figure per chart (each notebook cell had its own)
    # Cast to str so missing values render as a "None" category on the axis.
    df_plot[column] = df_plot[column].astype(str)
    ax = sns.barplot(x=column, y="count", data=df_plot, ci=None)
    for _, row in df_plot.iterrows():
        ax.text(row.name, row['count'] + label_offset,
                "%s%%" % row.percentage, color='black', ha="center")
    ax.set(ylabel='count')
    if rotation is not None:
        tick_kwargs = {}
        if ha is not None:
            tick_kwargs['ha'] = ha
        if fontsize is not None:
            tick_kwargs['fontsize'] = fontsize
        ax.set_xticklabels(ax.get_xticklabels(), rotation=rotation,
                           **tick_kwargs)
    return ax


# One top-10 breakdown per trace attribute — same attributes, order, label
# offsets and tick styling as the original per-cell code.
df_plot1 = _top10_by("account")
_plot_top10(df_plot1, "account", 3000000)

df_plot2 = _top10_by("clientState")
_plot_top10(df_plot2, "clientState", 3000000, rotation=30, ha="right")

df_plot3 = _top10_by("dataset")
_plot_top10(df_plot3, "dataset", 800000, rotation=90, fontsize=7)

df_plot4 = _top10_by("dataset_version")
_plot_top10(df_plot4, "dataset_version", 1000000, rotation=30, ha="right")

df_plot5 = _top10_by("datatype")
_plot_top10(df_plot5, "datatype", 1000000, rotation=30, ha="right")

df_plot6 = _top10_by("eventType")
_plot_top10(df_plot6, "eventType", 1000000, rotation=30, ha="right")

df_plot7 = _top10_by("filename")
_plot_top10(df_plot7, "filename", 300000, rotation=90, ha="right", fontsize=7)

df_plot8 = _top10_by("prod_step")
_plot_top10(df_plot8, "prod_step", 1000000, rotation=30, ha="right")

df_plot9 = _top10_by("project")
_plot_top10(df_plot9, "project", 1000000, rotation=30, ha="right")

df_plot10 = _top10_by("stream_name")
_plot_top10(df_plot10, "stream_name", 1000000, rotation=30, ha="right")

# Timestamps and trace ids are close to unique, so only the frequency tables
# are computed; their bar plots were commented out in the original as well.
df_plot11 = _top10_by("traceTimeentryUnix")

df_plot12 = _top10_by("uuid")
_plot_top10(df_plot12, "uuid", 10000, rotation=30, ha="right")

df_plot13 = _top10_by("guid")
_plot_top10(df_plot13, "guid", 1000000, rotation=30, ha="right")

df_plot14 = _top10_by("hostname")
_plot_top10(df_plot14, "hostname", 200000, rotation=30, ha="right")

df_plot15 = _top10_by("ip")
_plot_top10(df_plot15, "ip", 1000000, rotation=30, ha="right")

df_plot16 = _top10_by("localsite")
_plot_top10(df_plot16, "localsite", 500000, rotation=30, ha="right")

df_plot17 = _top10_by("name")
_plot_top10(df_plot17, "name", 3000000, rotation=90, ha="right")

df_plot18 = _top10_by("pq")
_plot_top10(df_plot18, "pq", 1000000, rotation=30, ha="right")

df_plot19 = _top10_by("protocol")
_plot_top10(df_plot19, "protocol", 500000)

df_plot20 = _top10_by("remoteSite")
_plot_top10(df_plot20, "remoteSite", 200000, rotation=30, ha="right")

df_plot21 = _top10_by("scope")
_plot_top10(df_plot21, "scope", 600000, rotation=30, ha="right")

df_plot22 = _top10_by("stateReason")
_plot_top10(df_plot22, "stateReason", 3000000, rotation=90, ha="right")

# Table only — traceId plotting was commented out in the original notebook.
df_plot23 = _top10_by("traceId")

df_plot24 = _top10_by("traceIp")
_plot_top10(df_plot24, "traceIp", 200000, rotation=30, ha="right")

df_plot25 = _top10_by("url")
_plot_top10(df_plot25, "url", 1000000, rotation=90, ha="right", fontsize=7)

df_plot26 = _top10_by("usr")
_plot_top10(df_plot26, "usr", 1000000, rotation=30, ha="right")

df_plot27 = _top10_by("usrdn")
_plot_top10(df_plot27, "usrdn", 700000, rotation=90, ha="right", fontsize=7)